<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="zh-Hans" xml:lang="zh-Hans"><head>

<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.198">

<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">


<title>R语言医学数据科学入门 - 3&nbsp; Tidy data</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1.6em;
  vertical-align: middle;
}
</style>


<script src="site_libs/quarto-nav/quarto-nav.js"></script>
<script src="site_libs/quarto-nav/headroom.min.js"></script>
<script src="site_libs/clipboard/clipboard.min.js"></script>
<script src="site_libs/quarto-search/autocomplete.umd.js"></script>
<script src="site_libs/quarto-search/fuse.min.js"></script>
<script src="site_libs/quarto-search/quarto-search.js"></script>
<meta name="quarto:offset" content="./">
<link href="./04-data%20IO.html" rel="next">
<link href="./02-Data%20types.html" rel="prev">
<script src="site_libs/quarto-html/quarto.js"></script>
<script src="site_libs/quarto-html/popper.min.js"></script>
<script src="site_libs/quarto-html/tippy.umd.min.js"></script>
<script src="site_libs/quarto-html/anchor.min.js"></script>
<link href="site_libs/quarto-html/tippy.css" rel="stylesheet">
<link href="site_libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="site_libs/bootstrap/bootstrap.min.js"></script>
<link href="site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="site_libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script id="quarto-search-options" type="application/json">{
  "location": "sidebar",
  "copy-button": false,
  "collapse-after": 3,
  "panel-placement": "start",
  "type": "textbox",
  "limit": 20,
  "language": {
    "search-no-results-text": "No results",
    "search-matching-documents-text": "matching documents",
    "search-copy-link-title": "Copy link to search",
    "search-hide-matches-text": "Hide additional matches",
    "search-more-match-text": "more match in this document",
    "search-more-matches-text": "more matches in this document",
    "search-clear-button-title": "Clear",
    "search-detached-cancel-button-title": "Cancel",
    "search-submit-button-title": "Submit"
  }
}</script>

<link href="site_libs/pagedtable-1.1/css/pagedtable.css" rel="stylesheet">
<script src="site_libs/pagedtable-1.1/js/pagedtable.js"></script>


</head>

<body class="nav-sidebar floating">

<div id="quarto-search-results"></div>
  <header id="quarto-header" class="headroom fixed-top">
  <nav class="quarto-secondary-nav" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
    <div class="container-fluid d-flex justify-content-between">
      <h1 class="quarto-secondary-nav-title"><span class="chapter-number">3</span>&nbsp; <span class="chapter-title">Tidy data</span></h1>
      <button type="button" class="quarto-btn-toggle btn" aria-label="Show secondary navigation">
        <i class="bi bi-chevron-right"></i>
      </button>
    </div>
  </nav>
</header>
<!-- content -->
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article">
<!-- sidebar -->
  <nav id="quarto-sidebar" class="sidebar collapse sidebar-navigation floating overflow-auto">
    <div class="pt-lg-2 mt-2 text-left sidebar-header">
    <div class="sidebar-title mb-0 py-0">
      <a href="./">R语言医学数据科学入门</a> 
        <div class="sidebar-tools-main">
    <a href="" title="Share" id="sidebar-tool-dropdown-0" class="sidebar-tool dropdown-toggle px-1" data-bs-toggle="dropdown" aria-expanded="false"><i class="bi bi-share"></i></a>
    <ul class="dropdown-menu" aria-labelledby="sidebar-tool-dropdown-0">
        <li>
          <a class="dropdown-item sidebar-tools-main-item" href="https://twitter.com/intent/tweet?url=|url|">
            <i class="bi bi-twitter pe-1"></i>
          Twitter
          </a>
        </li>
        <li>
          <a class="dropdown-item sidebar-tools-main-item" href="https://www.facebook.com/sharer/sharer.php?u=|url|">
            <i class="bi bi-facebook pe-1"></i>
          Facebook
          </a>
        </li>
    </ul>
</div>
    </div>
      </div>
      <div class="mt-2 flex-shrink-0 align-items-center">
        <div class="sidebar-search">
        <div id="quarto-search" class="" title="Search"></div>
        </div>
      </div>
    <div class="sidebar-menu-container"> 
    <ul class="list-unstyled mt-1">
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./index.html" class="sidebar-item-text sidebar-link">Preface</a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./intro.html" class="sidebar-item-text sidebar-link">Introduction</a>
  </div>
</li>
        <li class="sidebar-item sidebar-item-section">
      <div class="sidebar-item-container"> 
            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" aria-expanded="true">Data things</a>
          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" aria-expanded="true">
            <i class="bi bi-chevron-right ms-2"></i>
          </a> 
      </div>
      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./01-hello_data.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">1</span>&nbsp; <span class="chapter-title">Hello data</span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./02-Data%20types.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">2</span>&nbsp; <span class="chapter-title">数据结构和类型</span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./03-tidy_data.html" class="sidebar-item-text sidebar-link active"><span class="chapter-number">3</span>&nbsp; <span class="chapter-title">Tidy data</span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./04-data%20IO.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">4</span>&nbsp; <span class="chapter-title">Data I/O</span></a>
  </div>
</li>
      </ul>
  </li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./tips.html" class="sidebar-item-text sidebar-link">Tips</a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./references.html" class="sidebar-item-text sidebar-link">References</a>
  </div>
</li>
    </ul>
    </div>
</nav>
<!-- margin-sidebar -->
    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
        <nav id="TOC" role="doc-toc" class="toc-active">
    <h2 id="toc-title">Table of contents</h2>
   
  <ul>
  <li><a href="#column-variable" id="toc-column-variable" class="nav-link active" data-scroll-target="#column-variable"><span class="toc-section-number">3.1</span>  column-variable</a></li>
  <li><a href="#row-observation" id="toc-row-observation" class="nav-link" data-scroll-target="#row-observation"><span class="toc-section-number">3.2</span>  row-observation</a></li>
  <li><a href="#cell-value" id="toc-cell-value" class="nav-link" data-scroll-target="#cell-value"><span class="toc-section-number">3.3</span>  cell-value</a></li>
  <li><a href="#烂数据长啥样" id="toc-烂数据长啥样" class="nav-link" data-scroll-target="#烂数据长啥样"><span class="toc-section-number">3.4</span>  烂数据长啥样？</a></li>
  <li><a href="#tidy-data-rectangular-data" id="toc-tidy-data-rectangular-data" class="nav-link" data-scroll-target="#tidy-data-rectangular-data"><span class="toc-section-number">3.5</span>  Tidy Data = rectangular data</a></li>
  <li><a href="#有什么好处" id="toc-有什么好处" class="nav-link" data-scroll-target="#有什么好处"><span class="toc-section-number">3.6</span>  有什么好处</a></li>
  </ul>
</nav>
    </div>
<!-- main -->
<main class="content" id="quarto-document-content">

<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title d-none d-lg-block"><span class="chapter-number">3</span>&nbsp; <span class="chapter-title">Tidy data</span></h1>
</div>



<div class="quarto-title-meta">

    
  
    
  </div>
  

</header>

<div class="cell">

</div>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="image/03/Hadley_Wickham.jpg" class="img-fluid figure-img" alt="Hadley Wickham" width="600"></p>
<p></p><figcaption class="figure-caption">Hadley Wickham镇楼</figcaption><p></p>
</figure>
</div>
<p>这个题目真的很难讲，因为Hadley Wickham只用了三句非常简短的话概括了什么是 Tidy data（整洁数据）。 这三句话也成为了R领域关于数据框类型数据的圣经。之所以又说难理解，因为话说简单了，就容易让不同人有不同的理解。 有时候要相信大道至简，你越是想用更多的话去描述它，就越描述不清。</p>
<p><a href="https://r4ds.had.co.nz/tidy-data.html">Tidy data | R for Data Science</a></p>
<div class="callout-note callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<ul>
<li>Each variable must have its own column.</li>
<li>Each observation must have its own row.</li>
<li>Each value must have its own cell.</li>
</ul>
</div>
</div>
<p>其实这三句话也有很多变种，但无论怎么变，都是在表达同一个意思。</p>
<p><a href="https://tidyr.tidyverse.org/articles/tidy-data.html">Tidy data • tidyr</a></p>
<ul>
<li>Every column is a variable.</li>
<li>Every row is an observation.</li>
<li>Every cell is a single value.</li>
</ul>
<p><a href="https://www.openscapes.org/blog/2020/10/12/tidy-data/">Tidy data for efficiency, reproducibility, and collaboration</a></p>
<ul>
<li>Each variable forms a column.</li>
<li>Each observation forms a row.</li>
<li>Each cell is a single measurement.</li>
</ul>
<p>这是一种表格数据组织的规范，这个规范告诉我们，看起来都是表，但不一定符合标准，不能直接用来分析。 Hadley Wickham曾经效仿俄罗斯著名文豪Leo Tolstoy（托尔斯泰）的一句名言：</p>
<blockquote class="blockquote">
<p>Happy families are all alike; every unhappy family is unhappy in its own way — Leo Tolstoy</p>
</blockquote>
<p>Like families, <code>tidy datasets are all alike but every messy dataset is messy in its own way.</code></p>
<p><strong>整洁的表格数据都差不多，但烂表格数据都烂得各不相同</strong></p>
<section id="column-variable" class="level2" data-number="3.1">
<h2 data-number="3.1" class="anchored" data-anchor-id="column-variable"><span class="header-section-number">3.1</span> column-variable</h2>
<blockquote class="blockquote">
<p>Each variable must have its own column. 每一个变量都有自己的列</p>
</blockquote>
<p>在理解这一句时，首先要搞懂，什么是<code>variable</code>。作为医学研究，我们可以理解为研究对象的属性(attribute)，比如，病人的年龄、性别、收入等。<code>variable</code>是根据你要做的研究业务需求而设立的，比如最常见的RCT研究，不同的治疗方法或者说治疗组+对照组一起构成了group这个<code>variable</code>(列)。<code>variable</code>在中文也被称为字段。</p>
<p>variable根据业务需要，还可分为两类</p>
<ol type="1">
<li>自变量（X），也被称为input, predictor, (independent) variable, regressor, covariate, feature, explanatory variable</li>
<li>因变量（Y），也被称为dependent variable, response, outcome, target, output, response variable</li>
</ol>
<p>比如，下面这个数据，如果我们把是否早产（premature）作为要预测的Y（response），其他变量就可以定为X（predictors）</p>
<div class="cell">
<div class="cell-output-display">
<div id="tbl-birth" class="anchored">

<div data-pagedtable="false"> <div class="table-caption">Table&nbsp;3.1:  Birth table </div>
  <script data-pagedtable-source="" type="application/json">
{"columns":[{"label":["f_age"],"name":[1],"type":["int"],"align":["right"]},{"label":["m_age"],"name":[2],"type":["int"],"align":["right"]},{"label":["weeks"],"name":[3],"type":["int"],"align":["right"]},{"label":["premature"],"name":[4],"type":["fct"],"align":["left"]},{"label":["visits"],"name":[5],"type":["int"],"align":["right"]},{"label":["gained"],"name":[6],"type":["int"],"align":["right"]},{"label":["weight"],"name":[7],"type":["dbl"],"align":["right"]},{"label":["sex_baby"],"name":[8],"type":["fct"],"align":["left"]},{"label":["smoke"],"name":[9],"type":["fct"],"align":["left"]}],"data":[{"1":"31","2":"30","3":"39","4":"full term","5":"13","6":"1","7":"6.88","8":"male","9":"smoker"},{"1":"34","2":"36","3":"39","4":"full term","5":"5","6":"35","7":"7.69","8":"male","9":"nonsmoker"},{"1":"36","2":"35","3":"40","4":"full term","5":"12","6":"29","7":"8.88","8":"male","9":"nonsmoker"},{"1":"41","2":"40","3":"40","4":"full term","5":"13","6":"30","7":"9.00","8":"female","9":"nonsmoker"},{"1":"42","2":"37","3":"40","4":"full term","5":"NA","6":"10","7":"7.94","8":"male","9":"nonsmoker"},{"1":"37","2":"28","3":"40","4":"full term","5":"12","6":"35","7":"8.25","8":"male","9":"smoker"},{"1":"35","2":"35","3":"28","4":"premie","5":"6","6":"29","7":"1.63","8":"female","9":"nonsmoker"},{"1":"28","2":"21","3":"35","4":"premie","5":"9","6":"15","7":"5.50","8":"female","9":"smoker"},{"1":"22","2":"20","3":"32","4":"premie","5":"5","6":"40","7":"2.69","8":"male","9":"smoker"},{"1":"36","2":"25","3":"40","4":"full term","5":"13","6":"34","7":"8.75","8":"female","9":"nonsmoker"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
  </script>
</div>
</div>
</div>
</div>
<div class="callout-note callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p>复习：前面我们知道变量里的值可以是计量资料和分类资料，对应统计学模型就是回归问题或分类问题。</p>
<ul>
<li>In the regression problem, Y is numeric (e.g. price, blood pressure).</li>
<li>In the classification problem, Y takes values in a finite, unordered set (survived/died, cancer class of tissue sample).</li>
</ul>
</div>
</div>
</section>
<section id="row-observation" class="level2" data-number="3.2">
<h2 data-number="3.2" class="anchored" data-anchor-id="row-observation"><span class="header-section-number">3.2</span> row-observation</h2>
<blockquote class="blockquote">
<p>Each observation must have its own row. 每一条记录都有自己的行。</p>
</blockquote>
<p>observation也有很多同义词，如case, example, instance, record, pattern, sample。每一条记录都是独立的，都代表一个个体。也可以参考刚才的表格，一个记录就是一个新生儿 <a href="#tbl-birth">Table&nbsp;<span>3.1</span></a>。</p>
</section>
<section id="cell-value" class="level2" data-number="3.3">
<h2 data-number="3.3" class="anchored" data-anchor-id="cell-value"><span class="header-section-number">3.3</span> cell-value</h2>
<blockquote class="blockquote">
<p>Each value must have its own cell.一个单元格只能有一个值。</p>
</blockquote>
<p>其实这个第三条，可以不要，因为行和列就可以确定一个坐标，也就是一个单元格，行和列没有问题，单元格也就不会有问题。 这里的问题主要出现在单元格里数据格式不统一，如，下表alcgp和tobgp列有些值加了g/day，有些没有。</p>
<div class="cell">
<div class="cell-output-display">

<div data-pagedtable="false">
  <script data-pagedtable-source="" type="application/json">
{"columns":[{"label":["agegp"],"name":[1],"type":["ord"],"align":["right"]},{"label":["alcgp"],"name":[2],"type":["ord"],"align":["right"]},{"label":["tobgp"],"name":[3],"type":["ord"],"align":["right"]},{"label":["ncases"],"name":[4],"type":["dbl"],"align":["right"]},{"label":["ncontrols"],"name":[5],"type":["dbl"],"align":["right"]}],"data":[{"1":"25-34","2":"0-39g/day","3":"0-9g/day","4":"0","5":"40"},{"1":"25-34","2":"0-39g/day","3":"10-19","4":"0","5":"10"},{"1":"25-34","2":"0-39g/day","3":"20-29","4":"0","5":"6"},{"1":"25-34","2":"0-39g/day","3":"30+","4":"0","5":"5"},{"1":"25-34","2":"40-79","3":"0-9g/day","4":"0","5":"27"},{"1":"25-34","2":"40-79","3":"10-19","4":"0","5":"7"},{"1":"25-34","2":"40-79","3":"20-29","4":"0","5":"4"},{"1":"25-34","2":"40-79","3":"30+","4":"0","5":"7"},{"1":"25-34","2":"80-119","3":"0-9g/day","4":"0","5":"2"},{"1":"25-34","2":"80-119","3":"10-19","4":"0","5":"1"},{"1":"25-34","2":"80-119","3":"30+","4":"0","5":"2"},{"1":"25-34","2":"120+","3":"0-9g/day","4":"0","5":"1"},{"1":"25-34","2":"120+","3":"10-19","4":"1","5":"0"},{"1":"25-34","2":"120+","3":"20-29","4":"0","5":"1"},{"1":"25-34","2":"120+","3":"30+","4":"0","5":"2"},{"1":"35-44","2":"0-39g/day","3":"0-9g/day","4":"0","5":"60"},{"1":"35-44","2":"0-39g/day","3":"10-19","4":"1","5":"13"},{"1":"35-44","2":"0-39g/day","3":"20-29","4":"0","5":"7"},{"1":"35-44","2":"0-39g/day","3":"30+","4":"0","5":"8"},{"1":"35-44","2":"40-79","3":"0-9g/day","4":"0","5":"35"},{"1":"35-44","2":"40-79","3":"10-19","4":"3","5":"20"},{"1":"35-44","2":"40-79","3":"20-29","4":"1","5":"13"},{"1":"35-44","2":"40-79","3":"30+","4":"0","5":"8"},{"1":"35-44","2":"80-119","3":"0-9g/day","4":"0","5":"11"},{"1":"35-44","2":"80-119","3":"10-19","4":"0","5":"6"},{"1":"35-44","2":"80-119","3":"20-29","4":"0","5":"2"},{"1":"35-44","2":"80-119","3":"30+","4":"0","5":"1"},{"1":"35-44","2":"120+","3":"0-9g/day","4":"2","5":"1"},{"1":"35-44","2":"120+","3":"10-19","4":"0","5":"3"},{"1":"35-44","2":"120+","3":"20-29",
"4":"2","5":"2"},{"1":"45-54","2":"0-39g/day","3":"0-9g/day","4":"1","5":"45"},{"1":"45-54","2":"0-39g/day","3":"10-19","4":"0","5":"18"},{"1":"45-54","2":"0-39g/day","3":"20-29","4":"0","5":"10"},{"1":"45-54","2":"0-39g/day","3":"30+","4":"0","5":"4"},{"1":"45-54","2":"40-79","3":"0-9g/day","4":"6","5":"32"},{"1":"45-54","2":"40-79","3":"10-19","4":"4","5":"17"},{"1":"45-54","2":"40-79","3":"20-29","4":"5","5":"10"},{"1":"45-54","2":"40-79","3":"30+","4":"5","5":"2"},{"1":"45-54","2":"80-119","3":"0-9g/day","4":"3","5":"13"},{"1":"45-54","2":"80-119","3":"10-19","4":"6","5":"8"},{"1":"45-54","2":"80-119","3":"20-29","4":"1","5":"4"},{"1":"45-54","2":"80-119","3":"30+","4":"2","5":"2"},{"1":"45-54","2":"120+","3":"0-9g/day","4":"4","5":"0"},{"1":"45-54","2":"120+","3":"10-19","4":"3","5":"1"},{"1":"45-54","2":"120+","3":"20-29","4":"2","5":"1"},{"1":"45-54","2":"120+","3":"30+","4":"4","5":"0"},{"1":"55-64","2":"0-39g/day","3":"0-9g/day","4":"2","5":"47"},{"1":"55-64","2":"0-39g/day","3":"10-19","4":"3","5":"19"},{"1":"55-64","2":"0-39g/day","3":"20-29","4":"3","5":"9"},{"1":"55-64","2":"0-39g/day","3":"30+","4":"4","5":"2"},{"1":"55-64","2":"40-79","3":"0-9g/day","4":"9","5":"31"},{"1":"55-64","2":"40-79","3":"10-19","4":"6","5":"15"},{"1":"55-64","2":"40-79","3":"20-29","4":"4","5":"13"},{"1":"55-64","2":"40-79","3":"30+","4":"3","5":"3"},{"1":"55-64","2":"80-119","3":"0-9g/day","4":"9","5":"9"},{"1":"55-64","2":"80-119","3":"10-19","4":"8","5":"7"},{"1":"55-64","2":"80-119","3":"20-29","4":"3","5":"3"},{"1":"55-64","2":"80-119","3":"30+","4":"4","5":"0"},{"1":"55-64","2":"120+","3":"0-9g/day","4":"5","5":"5"},{"1":"55-64","2":"120+","3":"10-19","4":"6","5":"1"},{"1":"55-64","2":"120+","3":"20-29","4":"2","5":"1"},{"1":"55-64","2":"120+","3":"30+","4":"5","5":"1"},{"1":"65-74","2":"0-39g/day","3":"0-9g/day","4":"5","5":"43"},{"1":"65-74","2":"0-39g/day","3":"10-19","4":"4","5":"10"},{"1":"65-74","2":"0-39g/day","3":"20-29","4":"2","5":"5"},
{"1":"65-74","2":"0-39g/day","3":"30+","4":"0","5":"2"},{"1":"65-74","2":"40-79","3":"0-9g/day","4":"17","5":"17"},{"1":"65-74","2":"40-79","3":"10-19","4":"3","5":"7"},{"1":"65-74","2":"40-79","3":"20-29","4":"5","5":"4"},{"1":"65-74","2":"80-119","3":"0-9g/day","4":"6","5":"7"},{"1":"65-74","2":"80-119","3":"10-19","4":"4","5":"8"},{"1":"65-74","2":"80-119","3":"20-29","4":"2","5":"1"},{"1":"65-74","2":"80-119","3":"30+","4":"1","5":"0"},{"1":"65-74","2":"120+","3":"0-9g/day","4":"3","5":"1"},{"1":"65-74","2":"120+","3":"10-19","4":"1","5":"1"},{"1":"65-74","2":"120+","3":"20-29","4":"1","5":"0"},{"1":"65-74","2":"120+","3":"30+","4":"1","5":"0"},{"1":"75+","2":"0-39g/day","3":"0-9g/day","4":"1","5":"17"},{"1":"75+","2":"0-39g/day","3":"10-19","4":"2","5":"4"},{"1":"75+","2":"0-39g/day","3":"30+","4":"1","5":"2"},{"1":"75+","2":"40-79","3":"0-9g/day","4":"2","5":"3"},{"1":"75+","2":"40-79","3":"10-19","4":"1","5":"2"},{"1":"75+","2":"40-79","3":"20-29","4":"0","5":"3"},{"1":"75+","2":"40-79","3":"30+","4":"1","5":"0"},{"1":"75+","2":"80-119","3":"0-9g/day","4":"1","5":"0"},{"1":"75+","2":"80-119","3":"10-19","4":"1","5":"0"},{"1":"75+","2":"120+","3":"0-9g/day","4":"2","5":"0"},{"1":"75+","2":"120+","3":"10-19","4":"1","5":"0"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
  </script>
</div>
</div>
</div>
</section>
<section id="烂数据长啥样" class="level2" data-number="3.4">
<h2 data-number="3.4" class="anchored" data-anchor-id="烂数据长啥样"><span class="header-section-number">3.4</span> 烂数据长啥样？</h2>
<p>举几个例子</p>
<p><a href="https://towardsdatascience.com/whats-tidy-data-how-to-organize-messy-datasets-in-python-with-melt-and-pivotable-functions-5d52daa996c9">What’s Tidy Data?. How to organize messy datasets in| Towards Data Science</a></p>
<ol type="1">
<li>Column headers are values, not variable names. 即把value当成了variable。如果你用过Graphpad这类软件，你就知道它们的数据格式是这样的：</li>
</ol>
<div class="cell">
<div class="cell-output-display">

<div data-pagedtable="false">
  <script data-pagedtable-source="" type="application/json">
{"columns":[{"label":["drugA"],"name":[1],"type":["dbl"],"align":["right"]},{"label":["drugB"],"name":[2],"type":["dbl"],"align":["right"]},{"label":["drugC"],"name":[3],"type":["dbl"],"align":["right"]}],"data":[{"1":"0.7","2":"0.5","3":"0.3"},{"1":"1.0","2":"0.7","3":"0.6"},{"1":"1.5","2":"0.9","3":"1.0"},{"1":"1.8","2":"1.3","3":"1.2"},{"1":"2.2","2":"1.8","3":"3.3"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
  </script>
</div>
</div>
</div>
<p>这就是典型的把value当成了variable，实质上，应该变成2列。我们要研究的是不同组别（group）这个变量，体重增长水平有没有差异。</p>
<p>正确的表格，应该这样</p>
<div class="cell">
<div class="cell-output-display">

<div data-pagedtable="false">
  <script data-pagedtable-source="" type="application/json">
{"columns":[{"label":["group"],"name":[1],"type":["chr"],"align":["left"]},{"label":["weight"],"name":[2],"type":["dbl"],"align":["right"]}],"data":[{"1":"drugA","2":"0.7"},{"1":"drugA","2":"1.0"},{"1":"drugA","2":"1.5"},{"1":"drugA","2":"1.8"},{"1":"drugA","2":"2.2"},{"1":"drugB","2":"0.5"},{"1":"drugB","2":"0.7"},{"1":"drugB","2":"0.9"},{"1":"drugB","2":"1.3"},{"1":"drugB","2":"1.8"},{"1":"drugC","2":"0.3"},{"1":"drugC","2":"0.6"},{"1":"drugC","2":"1.0"},{"1":"drugC","2":"1.2"},{"1":"drugC","2":"3.3"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
  </script>
</div>
</div>
</div>
<ol start="2" type="1">
<li>Multiple variables are stored in one column.多个变量共享一列</li>
</ol>
<div class="cell">
<div class="cell-output-display">

<div data-pagedtable="false">
  <script data-pagedtable-source="" type="application/json">
{"columns":[{"label":["sex_age"],"name":[1],"type":["chr"],"align":["left"]},{"label":["value"],"name":[2],"type":["dbl"],"align":["right"]}],"data":[{"1":"m7","2":"0.5"},{"1":"f9","2":"0.7"},{"1":"f8","2":"0.9"},{"1":"m8","2":"1.3"},{"1":"f8","2":"1.8"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
  </script>
</div>
</div>
</div>
</section>
<section id="tidy-data-rectangular-data" class="level2" data-number="3.5">
<h2 data-number="3.5" class="anchored" data-anchor-id="tidy-data-rectangular-data"><span class="header-section-number">3.5</span> Tidy Data = rectangular data</h2>
<p>如果你想快速判断是不是tidy data</p>
<p>只要确定三个唯一，一列对应唯一一个变量，一行对应唯一一个记录，单元格里只有一个值。 Tidy Data = rectangular data，Tidy Data必须是方形的。</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="image/03/nonrectang.png" class="img-fluid figure-img" alt="非方形（non-rectangular）数据的示例"></p>
<p></p><figcaption class="figure-caption">这个数据不是方形的</figcaption><p></p>
</figure>
</div>
</section>
<section id="有什么好处" class="level2" data-number="3.6">
<h2 data-number="3.6" class="anchored" data-anchor-id="有什么好处"><span class="header-section-number">3.6</span> 有什么好处</h2>
<p>Tidy data 带来了什么？究竟有什么好处？</p>
<ol type="1">
<li>提高效率。所有人在录入数据和构建数据集时，尽量采取该规范，这样就可以尽量省去数据清洗的环节。即便数据确实一团糟，但也有了一个整理的方向和构架，而当前无论是<code>tidyverse</code>系列的工具还是<code>data.table</code>这样的工具，都是为了tidy data设计的。对于不同的软件，如果都采取该规范，在软件之间的互通也方便，比如SPSS数据确实可以直接在R中进行分析，但Graphpad的数据就无法直接在R中使用。</li>
<li>便于协作。当前是大数据时代，数据往往都是海量。数据在采集时，都是分工协作，采集不同的信息，大家都遵循该规范时，在数据整合时就可以无缝衔接。而且，有时候可能有多个表格，需要合并，符合tidy data规范，就按某个字段直接合并即可。</li>
<li>便于数据开源。科学研究，重要的是可重复性和复用，有了Tidy data，科研更透明，任何人都可对数据进行更新、合并，为我所用。</li>
</ol>


</section>

</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  // Reflect a stylesheet's `data-mode` attribute on <body>: "dark" selects the
  // `quarto-dark` class, any other value (including a missing attribute)
  // selects `quarto-light`. The opposite class is removed.
  const toggleBodyColorMode = (bsSheetEl) => {
    const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
    const bodyClasses = window.document.querySelector("body").classList;
    const [enable, disable] = isDark
      ? ["quarto-dark", "quarto-light"]
      : ["quarto-light", "quarto-dark"];
    bodyClasses.add(enable);
    bodyClasses.remove(disable);
  };
  // Apply the body colour-mode class based on the `link#quarto-bootstrap`
  // stylesheet element; does nothing when that element is not in the page.
  const toggleBodyColorPrimary = () => {
    const primarySheet = window.document.querySelector("link#quarto-bootstrap");
    if (!primarySheet) {
      return;
    }
    toggleBodyColorMode(primarySheet);
  };
  // Set the initial light/dark class on <body>.
  toggleBodyColorPrimary();
  // Glyph AnchorJS renders for each heading link. Written as an explicit escape
  // (U+E9CB, AnchorJS's default link icon) rather than a literal private-use
  // character, which is invisible in most editors and easily dropped by
  // encoding-lossy tooling -- an empty string here leaves the links with no glyph.
  const icon = "\uE9CB";
  const anchorJS = new window.AnchorJS();
  anchorJS.options = {
    placement: 'right',
    icon: icon
  };
  // Attach a link to every element carrying the `anchored` class.
  anchorJS.add('.anchored');
  // Copy buttons: ClipboardJS copies the element immediately preceding each
  // `.code-copy-button`.
  const clipboard = new window.ClipboardJS('.code-copy-button', {
    target: function(trigger) {
      return trigger.previousElementSibling;
    }
  });
  // After a successful copy, flash a "Copied!" state on the button for one
  // second, then put the button back the way it was.
  clipboard.on('success', function(e) {
    // button target
    const button = e.trigger;
    // don't keep focus
    button.blur();
    // flash "checked"
    button.classList.add('code-copy-button-checked');
    // getAttribute returns null when the button has no title attribute
    const previousTitle = button.getAttribute("title");
    button.setAttribute("title", "Copied!");
    let tooltip;
    // Bootstrap is optional; the tooltip is only shown when it is loaded.
    if (window.bootstrap) {
      button.setAttribute("data-bs-toggle", "tooltip");
      button.setAttribute("data-bs-placement", "left");
      button.setAttribute("data-bs-title", "Copied!");
      tooltip = new bootstrap.Tooltip(button, {
        trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]
      });
      tooltip.show();
    }
    setTimeout(function() {
      if (tooltip) {
        tooltip.hide();
        button.removeAttribute("data-bs-title");
        button.removeAttribute("data-bs-toggle");
        button.removeAttribute("data-bs-placement");
      }
      // Restore the original title. Writing null back through setAttribute
      // would produce the literal title "null", so drop the attribute when
      // there was none to begin with.
      if (previousTitle === null) {
        button.removeAttribute("title");
      } else {
        button.setAttribute("title", previousTitle);
      }
      button.classList.remove('code-copy-button-checked');
    }, 1000);
    // clear code selection
    e.clearSelection();
  });
  // Attach a hover popup to `el` via window.tippy. `contentFn` is handed to
  // tippy as the `content` option; the popup is appended to the parent of the
  // element tippy passes to `appendTo`.
  function tippyHover(el, contentFn) {
    window.tippy(el, {
      theme: 'quarto',
      placement: 'bottom-start',
      content: contentFn,
      allowHTML: true,
      interactive: true,
      interactiveBorder: 10,
      maxWidth: 500,
      delay: 100,
      arrow: false,
      appendTo: (reference) => reference.parentElement
    });
  }
  // Footnote references: hovering one shows the HTML of the footnote it points at.
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (const ref of noterefs) {
    tippyHover(ref, function() {
      // use id or data attribute instead here
      // Fall back to "" so a link with neither attribute cannot throw below.
      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href') || "";
      // Absolute URLs are reduced to their fragment; anything `new URL` rejects
      // (e.g. a bare "#fn1") is kept as-is.
      try { href = new URL(href).hash; } catch {}
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      // The target may be missing from this page; show an empty popup rather
      // than throwing on `null.innerHTML`.
      return note ? note.innerHTML : "";
    });
  }
  // Walk up from `el` until an element whose parent has a non-empty
  // `data-cites` attribute is found. Returns that element together with the
  // space-separated cite keys as an array, or undefined if the top of the tree
  // is reached first.
  const findCites = (el) => {
    let node = el;
    while (node.parentElement) {
      const cites = node.parentElement.dataset.cites;
      if (cites) {
        return { el: node, cites: cites.split(' ') };
      }
      node = node.parentElement;
    }
    return undefined;
  };
  // Citation links: hovering one shows a popup with one entry per cited key,
  // filled from the element with id `ref-<key>` when it exists on this page.
  const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  bibliorefs.forEach((ref) => {
    const citeInfo = findCites(ref);
    if (!citeInfo) {
      return;
    }
    tippyHover(citeInfo.el, () => {
      const popup = window.document.createElement('div');
      for (const cite of citeInfo.cites) {
        const entry = window.document.createElement('div');
        entry.classList.add('hanging-indent');
        entry.classList.add('csl-entry');
        const source = window.document.getElementById('ref-' + cite);
        if (source) {
          entry.innerHTML = source.innerHTML;
        }
        // An entry is appended even when no matching reference was found.
        popup.appendChild(entry);
      }
      return popup.innerHTML;
    });
  });
});
</script>
<nav class="page-navigation">
  <div class="nav-page nav-page-previous">
      <a href="./02-Data%20types.html" class="pagination-link">
        <i class="bi bi-arrow-left-short"></i> <span class="nav-page-text"><span class="chapter-number">2</span>&nbsp; <span class="chapter-title">数据结构和类型</span></span>
      </a>          
  </div>
  <div class="nav-page nav-page-next">
      <a href="./04-data%20IO.html" class="pagination-link">
        <span class="nav-page-text"><span class="chapter-number">4</span>&nbsp; <span class="chapter-title">Data I/O</span></span> <i class="bi bi-arrow-right-short"></i>
      </a>
  </div>
</nav>
</div> <!-- /content -->



</body></html>